import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
from scipy import stats
from plotnine import *
from plotnine.data import mtcars
import warnings
warnings.filterwarnings('ignore')
# Load the raw movies dataset from CSV and preview its first five rows.
movies_csv_path = "/Users/Divya Dubey/Downloads/movies.csv"
mv_ds = pd.read_csv(movies_csv_path)
mv_ds.head()
| id | title | genres | original_language | overview | popularity | production_companies | release_date | budget | revenue | runtime | status | tagline | vote_average | vote_count | credits | keywords | poster_path | backdrop_path | recommendations | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 436270 | Black Adam | Action-Fantasy-Science Fiction | en | Nearly 5000 years after he was bestowed with t... | 11752.795 | New Line Cinema-Flynn Picture Company-Seven Bu... | 2022-10-19 | 200000000.0 | 368000000.0 | 125.0 | Released | The world needed a hero. It got Black Adam. | 7.292 | 2420.0 | Dwayne Johnson-Aldis Hodge-Noah Centineo-Sarah... | lightning-anti hero-superhero-based on comic-d... | /pFlaoHTZeyNkG83vxsAJiGzfSsa.jpg | /bQXAqRx2Fgc46uCVWgoPz5L5Dtr.jpg | 663712-49046-642721-963954-365297-887731-63113... |
| 1 | 724495 | The Woman King | Action-Drama-History | en | The story of the Agojie the all-female unit of... | 4957.725 | TriStar Pictures-Entertainment One-JuVee Produ... | 2022-09-15 | 50000000.0 | 91000000.0 | 135.0 | Released | Her reign begins. | 7.906 | 586.0 | Viola Davis-Thuso Mbedu-Lashana Lynch-John Boy... | africa-arranged marriage-warrior woman-based o... | /438QXt1E3WJWb3PqNniK0tAE5c1.jpg | /7zQJYV02yehWrQN6NjKsBorqUUS.jpg | 49046-436270-913290-619730-882598-913816-80093... |
| 2 | 829799 | Paradise City | Crime-Action-Thriller | en | Renegade bounty hunter Ryan Swan must carve hi... | 3133.802 | Arcana Studio-308 Enterprises-Yale Productions... | 2022-11-11 | 20000000.0 | 0.0 | 93.0 | Released | NaN | 6.153 | 36.0 | John Travolta-Bruce Willis-Blake Jenner-Praya ... | NaN | /uGuHHS9SWv7MrFhCH6zoGGd7DA8.jpg | /au4HUSWDRadIcl9CqySlw1kJMfo.jpg | 879444-1018494-1015724-1026706-1032427-945444-... |
| 3 | 792775 | Cop Secret | Comedy-Action-Thriller | is | When Bússi Iceland's toughest cop is forced to... | 2447.908 | Pegasus Pictures-Stöð 2-SamFilm | 2022-05-23 | 0.0 | 0.0 | 100.0 | Released | To solve this crime, they'll need to break all... | 6.375 | 28.0 | Auðunn Blöndal-Egill Einarsson-Steinunn Ólína ... | co-workers relationship-bank robbery-lgbt-crim... | /jnWyZsaCl3Ke6u6ReSmBRO8S1rX.jpg | /sUuzl04qNIYsnwCLQpZ2RSvXA1V.jpg | 639933-852448-823625 |
| 4 | 956101 | The Eighth Clause | Thriller | la | Kat and Borja appear to be a perfect couple bu... | 2259.303 | SDB Films-El Hombre Orquesta | 2022-04-29 | 0.0 | 0.0 | 0.0 | Released | NaN | 4.600 | 10.0 | Maite Perroni-Manuel Vega-Óscar Jaenada-Jessic... | NaN | /8tc8eMFAX2SDC1TRu987qFQy8Cl.jpg | /kLnqNE9Af5QHyvUxw8cDGhF1ilv.jpg | NaN |
# Shape of the dataset as a (number of rows, number of columns) tuple
mv_ds.shape
(735388, 20)
# Relabel the vote_average column as imdb_ratings, then preview ten rows.
column_renames = {'vote_average': 'imdb_ratings'}
mv_ds = mv_ds.rename(columns=column_renames)
mv_ds.head(n=10)
| id | title | genres | original_language | overview | popularity | production_companies | release_date | budget | revenue | runtime | status | tagline | imdb_ratings | vote_count | credits | keywords | poster_path | backdrop_path | recommendations | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 436270 | Black Adam | Action-Fantasy-Science Fiction | en | Nearly 5000 years after he was bestowed with t... | 11752.795 | New Line Cinema-Flynn Picture Company-Seven Bu... | 2022-10-19 | 200000000.0 | 368000000.0 | 125.0 | Released | The world needed a hero. It got Black Adam. | 7.292 | 2420.0 | Dwayne Johnson-Aldis Hodge-Noah Centineo-Sarah... | lightning-anti hero-superhero-based on comic-d... | /pFlaoHTZeyNkG83vxsAJiGzfSsa.jpg | /bQXAqRx2Fgc46uCVWgoPz5L5Dtr.jpg | 663712-49046-642721-963954-365297-887731-63113... |
| 1 | 724495 | The Woman King | Action-Drama-History | en | The story of the Agojie the all-female unit of... | 4957.725 | TriStar Pictures-Entertainment One-JuVee Produ... | 2022-09-15 | 50000000.0 | 91000000.0 | 135.0 | Released | Her reign begins. | 7.906 | 586.0 | Viola Davis-Thuso Mbedu-Lashana Lynch-John Boy... | africa-arranged marriage-warrior woman-based o... | /438QXt1E3WJWb3PqNniK0tAE5c1.jpg | /7zQJYV02yehWrQN6NjKsBorqUUS.jpg | 49046-436270-913290-619730-882598-913816-80093... |
| 2 | 829799 | Paradise City | Crime-Action-Thriller | en | Renegade bounty hunter Ryan Swan must carve hi... | 3133.802 | Arcana Studio-308 Enterprises-Yale Productions... | 2022-11-11 | 20000000.0 | 0.0 | 93.0 | Released | NaN | 6.153 | 36.0 | John Travolta-Bruce Willis-Blake Jenner-Praya ... | NaN | /uGuHHS9SWv7MrFhCH6zoGGd7DA8.jpg | /au4HUSWDRadIcl9CqySlw1kJMfo.jpg | 879444-1018494-1015724-1026706-1032427-945444-... |
| 3 | 792775 | Cop Secret | Comedy-Action-Thriller | is | When Bússi Iceland's toughest cop is forced to... | 2447.908 | Pegasus Pictures-Stöð 2-SamFilm | 2022-05-23 | 0.0 | 0.0 | 100.0 | Released | To solve this crime, they'll need to break all... | 6.375 | 28.0 | Auðunn Blöndal-Egill Einarsson-Steinunn Ólína ... | co-workers relationship-bank robbery-lgbt-crim... | /jnWyZsaCl3Ke6u6ReSmBRO8S1rX.jpg | /sUuzl04qNIYsnwCLQpZ2RSvXA1V.jpg | 639933-852448-823625 |
| 4 | 956101 | The Eighth Clause | Thriller | la | Kat and Borja appear to be a perfect couple bu... | 2259.303 | SDB Films-El Hombre Orquesta | 2022-04-29 | 0.0 | 0.0 | 0.0 | Released | NaN | 4.600 | 10.0 | Maite Perroni-Manuel Vega-Óscar Jaenada-Jessic... | NaN | /8tc8eMFAX2SDC1TRu987qFQy8Cl.jpg | /kLnqNE9Af5QHyvUxw8cDGhF1ilv.jpg | NaN |
| 5 | 505642 | Black Panther: Wakanda Forever | Action-Adventure-Science Fiction | en | Queen Ramonda Shuri M’Baku Okoye and the Dora ... | 2248.449 | Marvel Studios | 2022-11-09 | 250000000.0 | 733000000.0 | 162.0 | Released | Forever. | 7.547 | 1182.0 | Letitia Wright-Lupita Nyong'o-Danai Gurira-Win... | hero-sequel-superhero-based on comic-duringcre... | /ps2oKfhY6DL3alynlSqY97gHSsg.jpg | /xDMIl84Qo5Tsu62c9DGWhmPI67A.jpg | 436270-785084-928123-663712-555604-615952-7441... |
| 6 | 948276 | Lost Bullet 2 | Action-Drama-Thriller | fr | Having cleared his name genius mechanic Lino h... | 2229.672 | Versus Production-Nolita-Inoxy Films | 2022-11-10 | 0.0 | 0.0 | 98.0 | Released | NaN | 6.650 | 140.0 | Alban Lenoir-Stéfi Celma-Pascale Arbillot-Séba... | french film | /uAeZI1JJbLPq7Bu5dziH7emHeu7.jpg | /a64zCJnqOwHYdFHfdQFqQcxYSAz.jpg | NaN |
| 7 | 899294 | Frank and Penelope | Thriller-Horror-Crime | en | A tale of love and violence when a man on his ... | 2128.548 | NaN | 2022-06-03 | 0.0 | 0.0 | 112.0 | Released | Prey for love. | 7.500 | 37.0 | Kevin Dillon-Sean Patrick Flanery-Johnathon Sc... | NaN | /5NpXoAi3nEQkEgLO09nmotPfyNa.jpg | /eyiSLRh44SKKWIJ6bxWq8z1sscB.jpg | NaN |
| 8 | 872177 | Corrective Measures | Science Fiction-Action | en | Set in San Tiburon the world's most dangerous ... | 1940.324 | The Exchange-Tubi TV-Arcana Productions | 2022-04-29 | 0.0 | 0.0 | 106.0 | Released | Anarchy in the world's most dangerous prison. | 5.100 | 35.0 | Bruce Willis-Hayley Sales-Michael Rooker-Kat R... | based on comic-lockdown-prison measures | /aHFq9NMhavOL0jtQvmHQ1c5e0ya.jpg | /8Tr79lfoCkOYRg8SYwWit4OoQLi.jpg | 1001717-639933-648579-755566-526896-725201 |
| 9 | 774752 | The Guardians of the Galaxy Holiday Special | Comedy-Science Fiction-Adventure | en | On a mission to make Christmas unforgettable f... | 1916.450 | Marvel Studios-Troll Court Entertainment-Kevin... | 2022-11-25 | 0.0 | 0.0 | 45.0 | Released | The perfect present is a galaxy away. | 7.465 | 564.0 | Chris Pratt-Dave Bautista-Karen Gillan-Pom Kle... | holiday-celebrity-superhero-talking dog-saving... | /8dqXyslZ2hv49Oiob9UjlGSHSTR.jpg | /rfnmMYuZ6EKOBvQLp2wqP21v7sI.jpg | 1010705-35981-1015724-410113-15997-616820-3666... |
# Display the dtype pandas holds for each column of the dataset
mv_ds.dtypes
id int64 title object genres object original_language object overview object popularity float64 production_companies object release_date object budget float64 revenue float64 runtime float64 status object tagline object imdb_ratings float64 vote_count float64 credits object keywords object poster_path object backdrop_path object recommendations object dtype: object
# Per column: True when at least one value is missing
mv_ds.isnull().any()
id False title True genres True original_language False overview True popularity False production_companies True release_date True budget False revenue False runtime True status False tagline True imdb_ratings False vote_count False credits True keywords True poster_path True backdrop_path True recommendations True dtype: bool
# Per column: how many values are missing
mv_ds.isnull().sum()
id 0 title 4 genres 217209 original_language 0 overview 119309 popularity 0 production_companies 393560 release_date 54900 budget 0 revenue 0 runtime 37082 status 0 tagline 626228 imdb_ratings 0 vote_count 0 credits 228937 keywords 523038 poster_path 193438 backdrop_path 511912 recommendations 696916 dtype: int64
# Count rows whose title repeats an earlier row's title
mv_ds.duplicated(subset=['title']).sum()
159732
# Count rows that repeat an earlier row's (title, release_date) pair
mv_ds.duplicated(subset=['title', 'release_date']).sum()
75377
# Keep only the first row for every (title, release_date) pair
duplicate_key = ['title', 'release_date']
mv_ds = mv_ds.drop_duplicates(subset=duplicate_key)
mv_ds.shape
(660011, 20)
# Some movies have a vote count of 0, so keep only the movies with at least 20 votes.
# reset_index() keeps the previous row labels in a new 'index' column.
has_enough_votes = mv_ds['vote_count'] >= 20
mv_ds1 = mv_ds.loc[has_enough_votes].reset_index()
mv_ds1.isna().sum()
index 0 id 0 title 0 genres 178 original_language 0 overview 467 popularity 0 production_companies 3236 release_date 2 budget 0 revenue 0 runtime 15 status 0 tagline 20581 imdb_ratings 0 vote_count 0 credits 643 keywords 9664 poster_path 130 backdrop_path 2312 recommendations 15338 dtype: int64
# Shape of the vote-filtered frame as (rows, columns)
mv_ds1.shape
(43834, 21)
# Remove the columns that are not needed, all in one call
unused_columns = [
    'poster_path',
    'backdrop_path',
    'recommendations',
    'keywords',
    'tagline',
    'credits',
    'overview',
]
mv_ds1 = mv_ds1.drop(columns=unused_columns)
mv_ds1.shape
(43834, 14)
# Missing values per remaining column
mv_ds1.isna().sum()
index 0 id 0 title 0 genres 178 original_language 0 popularity 0 production_companies 3236 release_date 2 budget 0 revenue 0 runtime 15 status 0 imdb_ratings 0 vote_count 0 dtype: int64
# Discard rows that lack a release date, genres, or production companies
required_columns = ['release_date', 'genres', 'production_companies']
mv_ds1 = mv_ds1.dropna(subset=required_columns)
mv_ds1.shape
(40554, 14)
# Fill missing runtime values with the column median (the median is less
# sensitive to a skewed distribution than the mean).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on mv_ds1["runtime"]: that chained form operates
# on a temporary object and leaves mv_ds1 unchanged under pandas Copy-on-Write.
mv_ds1["runtime"] = mv_ds1["runtime"].fillna(mv_ds1["runtime"].median())
mv_ds1.head(5)
| index | id | title | genres | original_language | popularity | production_companies | release_date | budget | revenue | runtime | status | imdb_ratings | vote_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 436270 | Black Adam | Action-Fantasy-Science Fiction | en | 11752.795 | New Line Cinema-Flynn Picture Company-Seven Bu... | 2022-10-19 | 200000000.0 | 368000000.0 | 125.0 | Released | 7.292 | 2420.0 |
| 1 | 1 | 724495 | The Woman King | Action-Drama-History | en | 4957.725 | TriStar Pictures-Entertainment One-JuVee Produ... | 2022-09-15 | 50000000.0 | 91000000.0 | 135.0 | Released | 7.906 | 586.0 |
| 2 | 2 | 829799 | Paradise City | Crime-Action-Thriller | en | 3133.802 | Arcana Studio-308 Enterprises-Yale Productions... | 2022-11-11 | 20000000.0 | 0.0 | 93.0 | Released | 6.153 | 36.0 |
| 3 | 3 | 792775 | Cop Secret | Comedy-Action-Thriller | is | 2447.908 | Pegasus Pictures-Stöð 2-SamFilm | 2022-05-23 | 0.0 | 0.0 | 100.0 | Released | 6.375 | 28.0 |
| 4 | 5 | 505642 | Black Panther: Wakanda Forever | Action-Adventure-Science Fiction | en | 2248.449 | Marvel Studios | 2022-11-09 | 250000000.0 | 733000000.0 | 162.0 | Released | 7.547 | 1182.0 |
# Missing values per column after cleaning
mv_ds1.isnull().sum()
index 0 id 0 title 0 genres 0 original_language 0 popularity 0 production_companies 0 release_date 0 budget 0 revenue 0 runtime 0 status 0 imdb_ratings 0 vote_count 0 dtype: int64
# Remove rows that are exact duplicates across every column
mv_ds1 = mv_ds1.drop_duplicates()
mv_ds1.shape
(40554, 14)
# Treat 0 as a placeholder for "unknown" and substitute a column statistic:
# the median for runtime, the mean for budget and revenue.
# NOTE(review): each statistic is computed on the column while it still
# contains the zeros being replaced, so those zeros pull the fill value
# down - confirm this is intended.
runtime_median = mv_ds1['runtime'].median()
mv_ds1['runtime'] = mv_ds1['runtime'].replace(0, runtime_median)

budget_mean = mv_ds1['budget'].mean()
mv_ds1['budget'] = mv_ds1['budget'].replace(0, budget_mean)

revenue_mean = mv_ds1['revenue'].mean()
mv_ds1['revenue'] = mv_ds1['revenue'].replace(0, revenue_mean)

mv_ds1.head(10)
| index | id | title | genres | original_language | popularity | production_companies | release_date | budget | revenue | runtime | status | imdb_ratings | vote_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 436270 | Black Adam | Action-Fantasy-Science Fiction | en | 11752.795 | New Line Cinema-Flynn Picture Company-Seven Bu... | 2022-10-19 | 2.000000e+08 | 3.680000e+08 | 125.0 | Released | 7.292 | 2420.0 |
| 1 | 1 | 724495 | The Woman King | Action-Drama-History | en | 4957.725 | TriStar Pictures-Entertainment One-JuVee Produ... | 2022-09-15 | 5.000000e+07 | 9.100000e+07 | 135.0 | Released | 7.906 | 586.0 |
| 2 | 2 | 829799 | Paradise City | Crime-Action-Thriller | en | 3133.802 | Arcana Studio-308 Enterprises-Yale Productions... | 2022-11-11 | 2.000000e+07 | 1.647994e+07 | 93.0 | Released | 6.153 | 36.0 |
| 3 | 3 | 792775 | Cop Secret | Comedy-Action-Thriller | is | 2447.908 | Pegasus Pictures-Stöð 2-SamFilm | 2022-05-23 | 6.066852e+06 | 1.647994e+07 | 100.0 | Released | 6.375 | 28.0 |
| 4 | 5 | 505642 | Black Panther: Wakanda Forever | Action-Adventure-Science Fiction | en | 2248.449 | Marvel Studios | 2022-11-09 | 2.500000e+08 | 7.330000e+08 | 162.0 | Released | 7.547 | 1182.0 |
| 5 | 6 | 948276 | Lost Bullet 2 | Action-Drama-Thriller | fr | 2229.672 | Versus Production-Nolita-Inoxy Films | 2022-11-10 | 6.066852e+06 | 1.647994e+07 | 98.0 | Released | 6.650 | 140.0 |
| 7 | 8 | 872177 | Corrective Measures | Science Fiction-Action | en | 1940.324 | The Exchange-Tubi TV-Arcana Productions | 2022-04-29 | 6.066852e+06 | 1.647994e+07 | 106.0 | Released | 5.100 | 35.0 |
| 8 | 9 | 774752 | The Guardians of the Galaxy Holiday Special | Comedy-Science Fiction-Adventure | en | 1916.450 | Marvel Studios-Troll Court Entertainment-Kevin... | 2022-11-25 | 6.066852e+06 | 1.647994e+07 | 45.0 | Released | 7.465 | 564.0 |
| 9 | 10 | 846778 | Margaux | Horror-Science Fiction | en | 1751.341 | Motion Picture Corporation of America-Lighthou... | 2022-09-09 | 6.066852e+06 | 1.647994e+07 | 105.0 | Released | 6.800 | 43.0 |
| 10 | 11 | 830784 | Lyle, Lyle, Crocodile | Comedy-Family-Music | en | 1710.176 | Columbia Pictures-Eagle Pictures-TSG Entertain... | 2022-10-07 | 5.000000e+07 | 7.976194e+07 | 106.0 | Released | 7.810 | 124.0 |
# Net profit per movie: revenue minus budget
mv_ds1["Profit"] = mv_ds1['revenue'] - mv_ds1['budget']
mv_ds1
| index | id | title | genres | original_language | popularity | production_companies | release_date | budget | revenue | runtime | status | imdb_ratings | vote_count | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 436270 | Black Adam | Action-Fantasy-Science Fiction | en | 11752.795 | New Line Cinema-Flynn Picture Company-Seven Bu... | 2022-10-19 | 2.000000e+08 | 3.680000e+08 | 125.0 | Released | 7.292 | 2420.0 | 1.680000e+08 |
| 1 | 1 | 724495 | The Woman King | Action-Drama-History | en | 4957.725 | TriStar Pictures-Entertainment One-JuVee Produ... | 2022-09-15 | 5.000000e+07 | 9.100000e+07 | 135.0 | Released | 7.906 | 586.0 | 4.100000e+07 |
| 2 | 2 | 829799 | Paradise City | Crime-Action-Thriller | en | 3133.802 | Arcana Studio-308 Enterprises-Yale Productions... | 2022-11-11 | 2.000000e+07 | 1.647994e+07 | 93.0 | Released | 6.153 | 36.0 | -3.520059e+06 |
| 3 | 3 | 792775 | Cop Secret | Comedy-Action-Thriller | is | 2447.908 | Pegasus Pictures-Stöð 2-SamFilm | 2022-05-23 | 6.066852e+06 | 1.647994e+07 | 100.0 | Released | 6.375 | 28.0 | 1.041309e+07 |
| 4 | 5 | 505642 | Black Panther: Wakanda Forever | Action-Adventure-Science Fiction | en | 2248.449 | Marvel Studios | 2022-11-09 | 2.500000e+08 | 7.330000e+08 | 162.0 | Released | 7.547 | 1182.0 | 4.830000e+08 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43829 | 584853 | 174323 | G.B.F. | Comedy-Drama | en | 0.600 | School Pictures-Parting Shots Media-Logolite E... | 2013-04-19 | 3.200000e+06 | 1.647994e+07 | 93.0 | Released | 6.000 | 366.0 | 1.327994e+07 |
| 43830 | 587770 | 182219 | Serial Teachers | Comedy | fr | 0.600 | UGC | 2013-04-17 | 1.200000e+07 | 1.647994e+07 | 88.0 | Released | 5.470 | 1109.0 | 4.479941e+06 |
| 43831 | 609313 | 184374 | Cinco de Mayo: The Battle | War-History-Drama | es | 0.600 | Estudios Churubusco Azteca-Gala Films-Gobierno... | 2013-05-03 | 1.000000e+07 | 1.647994e+07 | 125.0 | Released | 7.700 | 67.0 | 6.479941e+06 |
| 43832 | 687240 | 510819 | Dirty Dead Con Men | Action-Crime-Drama | en | 0.600 | Rock n' Tape Films-Thunder Alley Productions-N... | 2018-03-30 | 6.066852e+06 | 1.647994e+07 | 85.0 | Released | 4.000 | 20.0 | 1.041309e+07 |
| 43833 | 711541 | 505039 | Illicit Desires | Thriller | en | 0.600 | Retromedia Entertainment | 2018-04-03 | 6.066852e+06 | 1.647994e+07 | 81.0 | Released | 4.500 | 20.0 | 1.041309e+07 |
40554 rows × 15 columns
# Profit expressed as a percentage of the budget
profit_ratio = mv_ds1['Profit'].div(mv_ds1['budget'])
mv_ds1['Profit_Percentage'] = profit_ratio.mul(100)
mv_ds1
| index | id | title | genres | original_language | popularity | production_companies | release_date | budget | revenue | runtime | status | imdb_ratings | vote_count | Profit | Profit_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 436270 | Black Adam | Action-Fantasy-Science Fiction | en | 11752.795 | New Line Cinema-Flynn Picture Company-Seven Bu... | 2022-10-19 | 2.000000e+08 | 3.680000e+08 | 125.0 | Released | 7.292 | 2420.0 | 1.680000e+08 | 84.000000 |
| 1 | 1 | 724495 | The Woman King | Action-Drama-History | en | 4957.725 | TriStar Pictures-Entertainment One-JuVee Produ... | 2022-09-15 | 5.000000e+07 | 9.100000e+07 | 135.0 | Released | 7.906 | 586.0 | 4.100000e+07 | 82.000000 |
| 2 | 2 | 829799 | Paradise City | Crime-Action-Thriller | en | 3133.802 | Arcana Studio-308 Enterprises-Yale Productions... | 2022-11-11 | 2.000000e+07 | 1.647994e+07 | 93.0 | Released | 6.153 | 36.0 | -3.520059e+06 | -17.600296 |
| 3 | 3 | 792775 | Cop Secret | Comedy-Action-Thriller | is | 2447.908 | Pegasus Pictures-Stöð 2-SamFilm | 2022-05-23 | 6.066852e+06 | 1.647994e+07 | 100.0 | Released | 6.375 | 28.0 | 1.041309e+07 | 171.639059 |
| 4 | 5 | 505642 | Black Panther: Wakanda Forever | Action-Adventure-Science Fiction | en | 2248.449 | Marvel Studios | 2022-11-09 | 2.500000e+08 | 7.330000e+08 | 162.0 | Released | 7.547 | 1182.0 | 4.830000e+08 | 193.200000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43829 | 584853 | 174323 | G.B.F. | Comedy-Drama | en | 0.600 | School Pictures-Parting Shots Media-Logolite E... | 2013-04-19 | 3.200000e+06 | 1.647994e+07 | 93.0 | Released | 6.000 | 366.0 | 1.327994e+07 | 414.998153 |
| 43830 | 587770 | 182219 | Serial Teachers | Comedy | fr | 0.600 | UGC | 2013-04-17 | 1.200000e+07 | 1.647994e+07 | 88.0 | Released | 5.470 | 1109.0 | 4.479941e+06 | 37.332841 |
| 43831 | 609313 | 184374 | Cinco de Mayo: The Battle | War-History-Drama | es | 0.600 | Estudios Churubusco Azteca-Gala Films-Gobierno... | 2013-05-03 | 1.000000e+07 | 1.647994e+07 | 125.0 | Released | 7.700 | 67.0 | 6.479941e+06 | 64.799409 |
| 43832 | 687240 | 510819 | Dirty Dead Con Men | Action-Crime-Drama | en | 0.600 | Rock n' Tape Films-Thunder Alley Productions-N... | 2018-03-30 | 6.066852e+06 | 1.647994e+07 | 85.0 | Released | 4.000 | 20.0 | 1.041309e+07 | 171.639059 |
| 43833 | 711541 | 505039 | Illicit Desires | Thriller | en | 0.600 | Retromedia Entertainment | 2018-04-03 | 6.066852e+06 | 1.647994e+07 | 81.0 | Released | 4.500 | 20.0 | 1.041309e+07 | 171.639059 |
40554 rows × 16 columns
# The status column is dropped (the author notes most rows are "Released")
mv_ds1 = mv_ds1.drop(columns='status')
mv_ds1.shape
mv_ds1.head()
| index | id | title | genres | original_language | popularity | production_companies | release_date | budget | revenue | runtime | imdb_ratings | vote_count | Profit | Profit_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 436270 | Black Adam | Action-Fantasy-Science Fiction | en | 11752.795 | New Line Cinema-Flynn Picture Company-Seven Bu... | 2022-10-19 | 2.000000e+08 | 3.680000e+08 | 125.0 | 7.292 | 2420.0 | 1.680000e+08 | 84.000000 |
| 1 | 1 | 724495 | The Woman King | Action-Drama-History | en | 4957.725 | TriStar Pictures-Entertainment One-JuVee Produ... | 2022-09-15 | 5.000000e+07 | 9.100000e+07 | 135.0 | 7.906 | 586.0 | 4.100000e+07 | 82.000000 |
| 2 | 2 | 829799 | Paradise City | Crime-Action-Thriller | en | 3133.802 | Arcana Studio-308 Enterprises-Yale Productions... | 2022-11-11 | 2.000000e+07 | 1.647994e+07 | 93.0 | 6.153 | 36.0 | -3.520059e+06 | -17.600296 |
| 3 | 3 | 792775 | Cop Secret | Comedy-Action-Thriller | is | 2447.908 | Pegasus Pictures-Stöð 2-SamFilm | 2022-05-23 | 6.066852e+06 | 1.647994e+07 | 100.0 | 6.375 | 28.0 | 1.041309e+07 | 171.639059 |
| 4 | 5 | 505642 | Black Panther: Wakanda Forever | Action-Adventure-Science Fiction | en | 2248.449 | Marvel Studios | 2022-11-09 | 2.500000e+08 | 7.330000e+08 | 162.0 | 7.547 | 1182.0 | 4.830000e+08 | 193.200000 |
#DATA VISUALIZATION
#Displaying Ratings counts
%matplotlib inline
import matplotlib.pyplot as plt
score=mv_ds1["imdb_ratings"]
font = {'fontname':'Arial', 'size':'14'}
title_font = { 'weight' : 'bold','size':'16'}
plt.hist(score, bins=20)
plt.title("Distribution of the IMDB ratings")
plt.show()
# Correlation heat map of the numeric columns.
# The frame still holds string columns here (title, genres, ...). From
# pandas 2.0 DataFrame.corr() no longer drops them silently and raises
# instead, so the numeric columns are selected explicitly first.
plt.figure(figsize=(8,6))
numeric_columns = mv_ds1.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True, fmt='.2f', cmap='viridis')
<AxesSubplot:>
# Extract the release year into its own column and plot how many movies fall in each year
release_dates = pd.DatetimeIndex(mv_ds1['release_date'])
mv_ds1['year'] = release_dates.year
(
    ggplot(mv_ds1, aes(x='year'))  # data and the variable mapped to the x axis
    + geom_bar(size=20)            # bar chart of row counts per year
)
<ggplot: (150631231202)>
# release_date is no longer needed once the year column exists
mv_ds1 = mv_ds1.drop(columns='release_date')
# Line plot of Profit against imdb_ratings with a green smoothed trend on top
(
    ggplot(mv_ds1, aes(x='imdb_ratings', y='Profit'))
    + geom_line()
    + stat_smooth(colour='green', span=1)
)
<ggplot: (150631332060)>
#We can see that there is a strong correlation between the imdb_ratings and the profit.
#The movies with a high imdb rating have made more profit
# Same plot as for Profit, now with Profit_Percentage on the y axis
(
    ggplot(mv_ds1, aes(x='imdb_ratings', y='Profit_Percentage'))
    + geom_line()
    + stat_smooth(colour='green', span=1)
)
<ggplot: (150631360558)>
# Top 10 movies by profit.
# mv_ds1 is re-bound to the sorted frame, so later cells see it ordered by
# Profit (descending).
plt.figure(figsize=(7,6))
mv_ds1 = mv_ds1.sort_values(by ='Profit' , ascending=False)
mv_ds1_new = mv_ds1.head(10)
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally and raises a TypeError for positional x/y vectors.
a=sns.pointplot(data=mv_ds1_new, x='Profit', y='budget', hue='title')
a.set_xticklabels(a.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()
# Top 10 movies by rating.
# mv_ds1 is re-bound to the sorted frame, so later cells see it ordered by
# imdb_ratings (descending).
plt.figure(figsize=(10,8))
mv_ds1 = mv_ds1.sort_values(by ='imdb_ratings' , ascending=False)
mv_ds1_new = mv_ds1.head(10)
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally and raises a TypeError for positional x/y vectors.
a=sns.pointplot(data=mv_ds1_new, x='id', y='imdb_ratings', hue='title')
a.set_xticklabels(a.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()
mv_ds1.head(10)
| index | id | title | genres | original_language | popularity | production_companies | budget | revenue | runtime | imdb_ratings | vote_count | Profit | Profit_Percentage | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 40444 | 77710 | 571278 | Scooby-Doo's Original Mysteries | Animation-Adventure | en | 2.538 | Hanna-Barbera Productions | 6.066852e+06 | 1.647994e+07 | 110.0 | 10.0 | 23.0 | 1.041309e+07 | 171.639059 | 2000 |
| 23432 | 28815 | 392622 | What's New, Scooby-Doo? Vol. 7: Ready to Scare | Animation-Comedy-Family-Mystery | en | 6.206 | Hanna-Barbera Productions-Cartoon Network Studios | 6.066852e+06 | 1.647994e+07 | 97.0 | 10.0 | 42.0 | 1.041309e+07 | 171.639059 | 2006 |
| 39167 | 69069 | 638440 | Scooby-Doo: Agence toutou risques, vol. 2 : Le... | Animation-Family-Comedy | fr | 2.819 | Hanna-Barbera Productions | 6.066852e+06 | 1.647994e+07 | 97.0 | 10.0 | 21.0 | 1.041309e+07 | 171.639059 | 2007 |
| 35498 | 53109 | 495686 | What's New Scooby-Doo? Vol. 4: Merry Scary Hol... | Animation-Comedy-Science Fiction-Action-Family | en | 3.558 | Warner Bros. Pictures-Hanna-Barbera Production... | 6.066852e+06 | 1.647994e+07 | 85.0 | 10.0 | 41.0 | 1.041309e+07 | 171.639059 | 2007 |
| 36231 | 55602 | 609737 | The 1st 13th Annual Fancy Anvil Awards Show Pr... | Animation-Family-Comedy | en | 3.414 | Cartoon Network-Hanna-Barbera Productions-Cart... | 6.066852e+06 | 1.647994e+07 | 120.0 | 10.0 | 35.0 | 1.041309e+07 | 171.639059 | 2002 |
| 21769 | 26422 | 414119 | What's New Scooby-Doo? Vol. 3: Halloween Boos ... | Animation-Comedy-Family-Mystery | en | 6.702 | Hanna-Barbera Productions-Cartoon Network Studios | 6.066852e+06 | 1.647994e+07 | 84.0 | 10.0 | 44.0 | 1.041309e+07 | 171.639059 | 2007 |
| 42131 | 96325 | 638443 | Scooby-Doo: Agence toutou risques, vol. 1 : Le... | Animation-Family-Comedy | fr | 2.064 | Hanna-Barbera Productions | 6.066852e+06 | 1.647994e+07 | 97.0 | 10.0 | 21.0 | 1.041309e+07 | 171.639059 | 2007 |
| 34409 | 49786 | 405794 | Cartoon Network Christmas: Yuletide Follies | Animation-Family-Comedy | en | 3.765 | Turner Home Entertainment-Cartoon Network-Hann... | 6.066852e+06 | 1.647994e+07 | 110.0 | 9.9 | 40.0 | 1.041309e+07 | 171.639059 | 2004 |
| 40842 | 80914 | 642488 | Mickey's Safety Club: Street Safe, Street Smart | Family-Animation-Comedy-Music-TV Movie | en | 2.440 | Disney Educational Productions-Walt Disney Stu... | 6.066852e+06 | 1.647994e+07 | 13.0 | 9.9 | 30.0 | 1.041309e+07 | 171.639059 | 1989 |
| 25357 | 31600 | 386024 | What's New, Scooby-Doo? Vol. 7: Ghosts on the Go! | Animation-Comedy-Family-Mystery | en | 5.710 | Warner Bros. Pictures | 6.066852e+06 | 1.647994e+07 | 87.0 | 9.9 | 42.0 | 1.041309e+07 | 171.639059 | 2006 |
# Drop everything that will not be used as a model feature
name_columns = ['title', 'production_companies']      # free-text names
derived_columns = ['Profit', 'Profit_Percentage']     # computed from budget and revenue
other_columns = ['popularity', 'index', 'id']         # not needed
mv_ds1 = mv_ds1.drop(columns=name_columns + derived_columns + other_columns)

# Number of movies per original language
value_counts = mv_ds1["original_language"].value_counts()
print(value_counts)
en 25102
fr 3438
it 1970
ja 1629
es 1465
...
iu 1
se 1
ne 1
qu 1
eo 1
Name: original_language, Length: 84, dtype: int64
# Keep the label of the first entry in value_counts and relabel every other language as 'other'
vals = value_counts.iloc[:1].index
print(vals)
is_top_language = mv_ds1['original_language'].isin(vals)
mv_ds1['original_language'] = mv_ds1['original_language'].where(is_top_language, 'other')
Index(['en'], dtype='object')
# Number of rows per language label in the original_language column
mv_ds1["original_language"].value_counts()
en 25102 other 15452 Name: original_language, dtype: int64
# Drop the vote_count and genres columns
mv_ds1 = mv_ds1.drop(columns=['vote_count', 'genres'])
# One-hot encode the language label into language_* indicator columns
mv_ds1 = pd.get_dummies(mv_ds1, columns=['original_language'], prefix=['language'])
# Bucket the ratings into ordinal classes 1-4:
#   [0, 4] -> 1 (bad), (4, 6] -> 2 (average), (6, 8] -> 3 (good), (8, 10] -> 4 (excellent)
# include_lowest=True keeps a rating of exactly 0 in the first bucket; without
# it such a row falls outside every bin and its class becomes NaN.
mv_ds1["imdb_score"]=pd.cut(mv_ds1['imdb_ratings'], bins=[0,4,6,8,10], right=True, labels=False, include_lowest=True)+1
#Dropping the imdb_ratings column as it is being replaced with the imdb_score values
mv_ds1.drop('imdb_ratings',axis=1,inplace=True)
mv_ds1.head(5)
| budget | revenue | runtime | year | language_en | language_other | imdb_score | |
|---|---|---|---|---|---|---|---|
| 40444 | 6.066852e+06 | 1.647994e+07 | 110.0 | 2000 | 1 | 0 | 4 |
| 23432 | 6.066852e+06 | 1.647994e+07 | 97.0 | 2006 | 1 | 0 | 4 |
| 39167 | 6.066852e+06 | 1.647994e+07 | 97.0 | 2007 | 0 | 1 | 4 |
| 35498 | 6.066852e+06 | 1.647994e+07 | 85.0 | 2007 | 1 | 0 | 4 |
| 36231 | 6.066852e+06 | 1.647994e+07 | 120.0 | 2002 | 1 | 0 | 4 |
# Confirm that no column contains missing values before modelling
mv_ds1.isna().sum()
budget 0 revenue 0 runtime 0 year 0 language_en 0 language_other 0 imdb_score 0 dtype: int64
# Column labels remaining in the frame
mv_ds1.columns
Index(['budget', 'revenue', 'runtime', 'year', 'language_en', 'language_other',
'imdb_score'],
dtype='object')
# Split the data into features (X) and target (y), then into train and test sets.
# A stray "," line that made this cell a SyntaxError has been removed.
from sklearn.model_selection import train_test_split

feature_columns = ['runtime', 'budget', 'revenue', 'year', 'language_en']
X = mv_ds1[feature_columns]
# y stays a single-column DataFrame, as the later cells receive it in that form
y = mv_ds1[['imdb_score']]

# 70% train / 30% test; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3,random_state=100)
# Standardise the features: the scaler is fitted on the training split only
# and the same transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
#Models
#KNN Model
# Fit a k-nearest-neighbours classifier with default settings, then report
# accuracy and the confusion matrix on the training split.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Predictions on the same rows the model was fitted on
knnpred = knn_model.predict(X_train)
print(knnpred)

train_knn = accuracy_score(y_true=y_train, y_pred=knnpred)
print("\nAccuracy score on train dataset:", train_knn)

cnf_matrix_train = metrics.confusion_matrix(y_true=y_train, y_pred=knnpred)
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_train)
[2 3 2 ... 3 2 2] Accuracy score on train dataset: 0.7274456617465741 Train Data Confusin Matrix: [[ 84 279 168 0] [ 69 5479 3931 1] [ 51 2750 15061 12] [ 1 97 378 26]]
# Evaluate the fitted KNN model on the test split
from sklearn import metrics

knnpred1 = knn_model.predict(X_test)
print(knnpred1)

test_knn = accuracy_score(y_true=y_test, y_pred=knnpred1)
print("\nAccuracy score on test dataset:", test_knn)

cnf_matrix_test = metrics.confusion_matrix(y_true=y_test, y_pred=knnpred1)
print("\nTest Data Confusin Matrix:\n", cnf_matrix_test)
[3 3 2 ... 3 3 3] Accuracy score on test dataset: 0.6411605161502425 Test Data Confusin Matrix: [[ 13 135 80 0] [ 35 1831 2217 1] [ 30 1642 5953 12] [ 0 29 185 4]]
#Random Forest
# Fit a random forest with default settings, then report accuracy and the
# confusion matrix on the training split.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

# Predictions on the same rows the model was fitted on
rfc_pred = rfc.predict(X_train)
print(rfc_pred)

train_rfc = accuracy_score(y_true=y_train, y_pred=rfc_pred)
print("\nAccuracy score on train dataset:", train_rfc)

cnf_matrix_rfc = metrics.confusion_matrix(y_true=y_train, y_pred=rfc_pred)
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_rfc)
[2 3 2 ... 3 2 2] Accuracy score on train dataset: 0.8618733927502026 Train Data Confusin Matrix: [[ 246 167 118 0] [ 12 7540 1924 4] [ 10 1503 16355 6] [ 0 41 136 325]]
# Evaluate the fitted random forest on the test split
from sklearn import metrics

rfc_pred1 = rfc.predict(X_test)
print(rfc_pred1)

test_rfc = accuracy_score(y_true=y_test, y_pred=rfc_pred1)
print("\nAccuracy score on test dataset:", test_rfc)

cnf_matrix_rfc1 = metrics.confusion_matrix(y_true=y_test, y_pred=rfc_pred1)
print("\nTest Data Confusin Matrix:\n", cnf_matrix_rfc1)
[3 3 3 ... 3 3 3] Accuracy score on test dataset: 0.6375441768718665 Test Data Confusin Matrix: [[ 6 133 88 1] [ 29 1783 2265 7] [ 29 1607 5965 36] [ 1 25 189 3]]
#Decision Tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Seed the tree. scikit-learn permutes candidate features at every split, so
# ties between equally good splits are broken randomly and an unseeded tree
# can differ from run to run; a fixed seed keeps the scores reproducible.
dtree = DecisionTreeClassifier(random_state=42)

#Fit the model with train data
dtree.fit(X_train, y_train)

#Predict the target on train data set
dtree_pred = dtree.predict(X_train)
print(dtree_pred)

# Training accuracy only shows how well the tree fits the data it was grown
# on; compare it with the test-split score to judge overfitting.
train_dtree = accuracy_score(y_train, dtree_pred)
print("\nAccuracy score on train dataset:", train_dtree)

# Confusion matrix: rows are true classes, columns are predicted classes.
cnf_matrix_dtree = metrics.confusion_matrix(y_train, dtree_pred)
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_dtree)
[2 2 2 ... 3 2 2] Accuracy score on train dataset: 0.8619086201430232 Train Data Confusin Matrix: [[ 302 143 86 0] [ 50 8063 1367 0] [ 43 2046 15784 1] [ 1 57 126 318]]
# Decision tree on the test split: predict with the fitted tree, then report
# accuracy and the confusion matrix (rows = true classes, columns = predicted).
from sklearn import metrics

dtree_pred1 = dtree.predict(X_test)
print(dtree_pred1)

test_dtree = accuracy_score(y_true=y_test, y_pred=dtree_pred1)
print("\nAccuracy score on test dataset:", test_dtree)

cnf_matrix_dtree1 = metrics.confusion_matrix(y_true=y_test, y_pred=dtree_pred1)
print("\nTest Data Confusin Matrix:\n", cnf_matrix_dtree1)
[3 3 2 ... 3 3 3] Accuracy score on test dataset: 0.5935727788279773 Test Data Confusin Matrix: [[ 13 135 78 2] [ 86 2007 1977 14] [ 73 2235 5188 141] [ 2 46 156 14]]
#Linear Regression
# NOTE(review): LinearRegression is a regressor used here on a class-label
# target; its output is a continuous score, not a class. A classifier such as
# LogisticRegression would be the conventional baseline - consider swapping.
from sklearn import metrics
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

#Fit the model with train data
linreg.fit(X_train, y_train)

#Predict the target on train data set (raw continuous scores)
linreg_pred = linreg.predict(X_train)
print(linreg_pred)

# Convert scores to class labels: round to the nearest integer, then clip into
# the range of labels actually present in y_train. Rounding alone can land
# outside that range, which adds an empty phantom class to the confusion
# matrix and turns every label into a float.
_train_label_values = np.asarray(y_train)
linreg_train_labels = np.clip(
    np.rint(linreg_pred),
    _train_label_values.min(),
    _train_label_values.max(),
).astype(int)

train_linreg = accuracy_score(y_train, linreg_train_labels)
print("\nAccuracy score on train dataset:", train_linreg)

# Confusion matrix: rows are true classes, columns are predicted classes.
cnf_matrix_linreg = metrics.confusion_matrix(y_train, linreg_train_labels)
print("\nTrain Data Confusin Matrix:\n", cnf_matrix_linreg)
[[2.5304214 ] [2.82431788] [2.4951687 ] ... [2.70776624] [2.721521 ] [2.6848867 ]] Accuracy score on train dataset: 0.6079543452989045 Train Data Confusin Matrix: [[ 0 5 526 0 0] [ 0 132 9348 0 0] [ 0 738 17120 16 0] [ 0 43 452 6 1] [ 0 0 0 0 0]]
#Predict the target on test data set (raw continuous scores)
from sklearn import metrics

linreg_pred1 = linreg.predict(X_test)
print(linreg_pred1)

# Convert scores to class labels: round to the nearest integer, then clip into
# the range of labels the model was trained on (taken from y_train, not
# y_test, so nothing is learned from the test split). Rounding alone can land
# outside that range and adds an empty phantom class to the confusion matrix.
_known_label_values = np.asarray(y_train)
linreg_test_labels = np.clip(
    np.rint(linreg_pred1),
    _known_label_values.min(),
    _known_label_values.max(),
).astype(int)

test_linreg = accuracy_score(y_test, linreg_test_labels)
print("\nAccuracy score on test dataset:", test_linreg)

# Confusion matrix: rows are true classes, columns are predicted classes.
cnf_matrix_linreg1 = metrics.confusion_matrix(y_test, linreg_test_labels)
print("\nTest Data Confusin Matrix:\n", cnf_matrix_linreg1)
[[2.61446715] [2.70317356] [2.62081408] ... [2.77506609] [2.55518263] [2.69697516]] Accuracy score on test dataset: 0.6073806197090491 Test Data Confusin Matrix: [[ 0 1 227 0 0] [ 0 60 4024 0 0] [ 0 300 7330 6 1] [ 0 19 199 0 0] [ 0 0 0 0 0]]
#Model Comparison
#classification report for train data on all models
from sklearn.metrics import classification_report

# The linear-regression output is a continuous score. Round it and clip it
# into the label range of y_train before reporting; rounding alone can yield
# a label outside that range, which shows up as an extra class row with zero
# support and drags down the macro averages.
_report_label_values = np.asarray(y_train)
linreg_train_report_labels = np.clip(
    np.rint(linreg_pred),
    _report_label_values.min(),
    _report_label_values.max(),
).astype(int)

print('classification report for train data on all models\n')

# One (heading, predictions) pair per model, printed in a fixed order.
train_report_inputs = [
    ('KNN Reports\n', knnpred),
    ('Random Forests Reports\n', rfc_pred),
    ('Decision Tree Reports\n', dtree_pred),
    ('Linear Regression Reports\n', linreg_train_report_labels),
]
for report_heading, predicted_labels in train_report_inputs:
    print(report_heading, classification_report(y_train, predicted_labels))
classification report for train data on all models
KNN Reports
precision recall f1-score support
1 0.41 0.16 0.23 531
2 0.64 0.58 0.61 9480
3 0.77 0.84 0.81 17874
4 0.67 0.05 0.10 502
accuracy 0.73 28387
macro avg 0.62 0.41 0.43 28387
weighted avg 0.72 0.73 0.72 28387
Random Forests Reports
precision recall f1-score support
1 0.92 0.46 0.62 531
2 0.82 0.80 0.81 9480
3 0.88 0.92 0.90 17874
4 0.97 0.65 0.78 502
accuracy 0.86 28387
macro avg 0.90 0.71 0.77 28387
weighted avg 0.86 0.86 0.86 28387
Decision Tree Reports
precision recall f1-score support
1 0.76 0.57 0.65 531
2 0.78 0.85 0.81 9480
3 0.91 0.88 0.90 17874
4 1.00 0.63 0.77 502
accuracy 0.86 28387
macro avg 0.86 0.73 0.78 28387
weighted avg 0.87 0.86 0.86 28387
Linear Regression Reports
precision recall f1-score support
1.0 0.00 0.00 0.00 531
2.0 0.14 0.01 0.03 9480
3.0 0.62 0.96 0.76 17874
4.0 0.27 0.01 0.02 502
5.0 0.00 0.00 0.00 0
accuracy 0.61 28387
macro avg 0.21 0.20 0.16 28387
weighted avg 0.45 0.61 0.48 28387
#classification report for test data on all models
from sklearn.metrics import classification_report

# The linear-regression output is a continuous score. Round it and clip it
# into the label range the model was trained on (from y_train) before
# reporting; rounding alone can yield a label outside that range, which shows
# up as an extra class row with zero support and drags down the macro averages.
_report_known_labels = np.asarray(y_train)
linreg_test_report_labels = np.clip(
    np.rint(linreg_pred1),
    _report_known_labels.min(),
    _report_known_labels.max(),
).astype(int)

print('classification report for test data on all models\n')

# One (heading, predictions) pair per model, printed in a fixed order.
test_report_inputs = [
    ('KNN Reports\n', knnpred1),
    ('Random Forests Reports\n', rfc_pred1),
    ('Decision Tree Reports\n', dtree_pred1),
    ('Linear Regression Reports\n', linreg_test_report_labels),
]
for report_heading, predicted_labels in test_report_inputs:
    print(report_heading, classification_report(y_test, predicted_labels))
classification report for test data on all models
KNN Reports
precision recall f1-score support
1 0.17 0.06 0.08 228
2 0.50 0.45 0.47 4084
3 0.71 0.78 0.74 7637
4 0.24 0.02 0.03 218
accuracy 0.64 12167
macro avg 0.40 0.33 0.33 12167
weighted avg 0.62 0.64 0.63 12167
Random Forests Reports
precision recall f1-score support
1 0.09 0.03 0.04 228
2 0.50 0.44 0.47 4084
3 0.70 0.78 0.74 7637
4 0.06 0.01 0.02 218
accuracy 0.64 12167
macro avg 0.34 0.31 0.32 12167
weighted avg 0.61 0.64 0.62 12167
Decision Tree Reports
precision recall f1-score support
1 0.07 0.06 0.06 228
2 0.45 0.49 0.47 4084
3 0.70 0.68 0.69 7637
4 0.08 0.06 0.07 218
accuracy 0.59 12167
macro avg 0.33 0.32 0.32 12167
weighted avg 0.60 0.59 0.59 12167
Linear Regression Reports
precision recall f1-score support
1.0 0.00 0.00 0.00 228
2.0 0.16 0.01 0.03 4084
3.0 0.62 0.96 0.76 7637
4.0 0.00 0.00 0.00 218
5.0 0.00 0.00 0.00 0
accuracy 0.61 12167
macro avg 0.16 0.19 0.16 12167
weighted avg 0.44 0.61 0.48 12167